Final Project: Phase 2 - EDA¶
Spring 2024
Group: Michael Massone and Joseph Nelson Farrell
DS 5230 Unsupervised Machine Learning
Professor Steven Morin, PhD
Due: 03/11/2024
Libraries¶
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import sys
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import warnings
Define File Paths¶
In [ ]:
# Resolve the project layout from the notebook's working directory: the
# project root is taken to be the parent of the folder the kernel runs in.
nb_path = Path.cwd()
print(nb_path)
path = str(nb_path.parent)
print(path)

# Sibling folders of the project root. They are kept as plain strings because
# later cells build file names by string concatenation.
# path to figs folder
figs_path = path + '/figs'
# path to data
data_path = path + '/data'
# path to src folder
src_path = path + '/src'
print(src_path)

# Make the project's helper modules importable. The membership check keeps
# the cell idempotent: re-running it no longer piles duplicate entries onto
# sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/notebooks /Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final /Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/src
Functions¶
In [ ]:
from preprocessing_eda_utils import generate_column_hist
from preprocessing_eda_utils import sub_divide_pairplot
Parameters¶
In [ ]:
# File name of the transformed design matrix, given relative to the data
# folder. The leading slash matters: the value is appended to the data folder
# path by plain string concatenation when the file is read.
data_file = "/curated/trans_data_design.csv"
Load Data¶
In [ ]:
# build the full location of the design-matrix CSV, then load it into a frame
design_csv = data_path + data_file
trans_df = pd.read_csv(design_csv)
Exploratory Data Analysis¶
In [ ]:
# column labels of the design matrix; later cells reuse this list
attr_list = trans_df.columns

# print a numbered (1-based) listing of every attribute
print('Attributes List:')
for position, attribute in enumerate(attr_list, start=1):
    print(f'{position}: {attribute}')
Attributes List: 1: numerical__Area 2: numerical__Perimeter 3: numerical__MajorAxisLength 4: numerical__MinorAxisLength 5: numerical__AspectRation 6: numerical__Eccentricity 7: numerical__ConvexArea 8: numerical__EquivDiameter 9: numerical__Extent 10: numerical__Solidity 11: numerical__roundness 12: numerical__Compactness 13: numerical__ShapeFactor1 14: numerical__ShapeFactor2 15: numerical__ShapeFactor3 16: numerical__ShapeFactor4
In [ ]:
# report (rows, columns) of the frame restricted to the listed attributes
design_shape = trans_df[attr_list].shape
print('Transformed Dataframe Dimensions:', design_shape)
Transformed Dataframe Dimensions: (13611, 16)
In [ ]:
# eyeball the first five rows of the transformed frame as a rich table
display(trans_df.head(5))
| numerical__Area | numerical__Perimeter | numerical__MajorAxisLength | numerical__MinorAxisLength | numerical__AspectRation | numerical__Eccentricity | numerical__ConvexArea | numerical__EquivDiameter | numerical__Extent | numerical__Solidity | numerical__roundness | numerical__Compactness | numerical__ShapeFactor1 | numerical__ShapeFactor2 | numerical__ShapeFactor3 | numerical__ShapeFactor4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.840749 | -1.143319 | -1.306598 | -0.631153 | -1.565053 | -2.185720 | -0.841451 | -1.063341 | 0.289087 | 0.367613 | 1.423867 | 1.839116 | 0.680786 | 2.402173 | 1.925723 | 0.838371 |
| 1 | -0.829188 | -1.013924 | -1.395911 | -0.434445 | -1.969784 | -3.686040 | -0.826102 | -1.044217 | 0.697477 | -0.462907 | 0.231054 | 2.495449 | 0.367967 | 3.100893 | 2.689702 | 0.771138 |
| 2 | -0.807157 | -1.078829 | -1.252357 | -0.585735 | -1.514291 | -2.045336 | -0.808704 | -1.008084 | 0.578195 | 0.518417 | 1.252865 | 1.764843 | 0.603129 | 2.235091 | 1.841356 | 0.916755 |
| 3 | -0.785741 | -0.977215 | -1.278825 | -0.439290 | -1.741618 | -2.742211 | -0.773975 | -0.973337 | 0.671260 | -2.241767 | 0.515049 | 2.081715 | 0.401718 | 2.515075 | 2.204250 | -0.197985 |
| 4 | -0.781239 | -1.097384 | -1.380471 | -0.266663 | -2.117993 | -4.535028 | -0.784286 | -0.966080 | 0.476020 | 0.804772 | 1.874992 | 2.765330 | 0.118268 | 3.270983 | 3.013462 | 0.939640 |
In [ ]:
# print the frame's summary: index range, per-column non-null counts and
# dtypes, and memory usage
trans_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 13611 entries, 0 to 13610 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 numerical__Area 13611 non-null float64 1 numerical__Perimeter 13611 non-null float64 2 numerical__MajorAxisLength 13611 non-null float64 3 numerical__MinorAxisLength 13611 non-null float64 4 numerical__AspectRation 13611 non-null float64 5 numerical__Eccentricity 13611 non-null float64 6 numerical__ConvexArea 13611 non-null float64 7 numerical__EquivDiameter 13611 non-null float64 8 numerical__Extent 13611 non-null float64 9 numerical__Solidity 13611 non-null float64 10 numerical__roundness 13611 non-null float64 11 numerical__Compactness 13611 non-null float64 12 numerical__ShapeFactor1 13611 non-null float64 13 numerical__ShapeFactor2 13611 non-null float64 14 numerical__ShapeFactor3 13611 non-null float64 15 numerical__ShapeFactor4 13611 non-null float64 dtypes: float64(16) memory usage: 1.7 MB
In [ ]:
# number of missing entries (np.nan or None) in each column
missing_per_column = trans_df.isna().sum()
print('\nNA (np.nan or None) Count:\n', missing_per_column, sep='')
NA (np.nan or None) Count: numerical__Area 0 numerical__Perimeter 0 numerical__MajorAxisLength 0 numerical__MinorAxisLength 0 numerical__AspectRation 0 numerical__Eccentricity 0 numerical__ConvexArea 0 numerical__EquivDiameter 0 numerical__Extent 0 numerical__Solidity 0 numerical__roundness 0 numerical__Compactness 0 numerical__ShapeFactor1 0 numerical__ShapeFactor2 0 numerical__ShapeFactor3 0 numerical__ShapeFactor4 0 dtype: int64
In [ ]:
# share of missing entries per column: missing count divided by row count
row_total = trans_df.shape[0]
missing_fraction = trans_df.isna().sum() / row_total
print('\nNA (np.nan or None) Ratio:\n', missing_fraction, sep='')
NA (np.nan or None) Ratio: numerical__Area 0.0 numerical__Perimeter 0.0 numerical__MajorAxisLength 0.0 numerical__MinorAxisLength 0.0 numerical__AspectRation 0.0 numerical__Eccentricity 0.0 numerical__ConvexArea 0.0 numerical__EquivDiameter 0.0 numerical__Extent 0.0 numerical__Solidity 0.0 numerical__roundness 0.0 numerical__Compactness 0.0 numerical__ShapeFactor1 0.0 numerical__ShapeFactor2 0.0 numerical__ShapeFactor3 0.0 numerical__ShapeFactor4 0.0 dtype: float64
Pairplots¶
In [ ]:
# Silence FutureWarning messages. The filter is deliberately left global, as
# before, so it also applies to cells that run after this one.
warnings.filterwarnings('ignore', category=FutureWarning)

# Grid of pairwise plots over every column of the transformed frame.
plot = sns.pairplot(data=trans_df)

# Work on the grid's own figure instead of pyplot's implicit "current figure".
# `figure` is the preferred accessor; `fig` is documented as deprecated.
grid_fig = plot.figure
grid_fig.suptitle('Attribute Pairplots', fontsize=60, weight='bold', style="italic", y=1.03)
grid_fig.tight_layout()

# savefig does not create missing folders, so make sure the target exists
# (e.g. on a fresh checkout where the figs folder has not been created yet).
os.makedirs(figs_path, exist_ok=True)
grid_fig.savefig(figs_path + "/pairplot_full.png", bbox_inches='tight')
In [ ]:
# Helper imported from the project's preprocessing_eda_utils module; its
# implementation is not part of this notebook. Presumably it draws the
# pairplot as several smaller sub-grids — TODO confirm against the module.
sub_divide_pairplot(trans_df)
Histograms of Numerical Columns¶
In [ ]:
# Split the attribute labels into consecutive groups; each group is plotted
# and saved as its own figure.
group_size = 4
divided_columns = [
    attr_list[start:start + group_size]
    for start in range(0, len(attr_list), group_size)
]

# savefig does not create missing folders, so make sure the target exists
# before the first figure is written.
os.makedirs(figs_path, exist_ok=True)

# Draw the histograms for each group and save the figure under the group's
# index (attribute_hist_0.png, attribute_hist_1.png, ...).
for i, cols in enumerate(divided_columns):
    generate_column_hist(trans_df, cols)
    # Saves whichever figure is current once the helper returns.
    # NOTE(review): if the helper shows or closes its figure internally, this
    # would write an empty image — confirm against preprocessing_eda_utils.
    plt.savefig(figs_path + f"/attribute_hist_{i}.png", bbox_inches='tight')